import numpy as np
import pandas as pd
import seaborn as sns #数据可视化
import matplotlib.pyplot as plt
%matplotlib inline
from ydata_profiling import ProfileReport #数据EDA交互工具 EDA(Exploratory Data Analysis)数据探索性分析
from sklearn.model_selection import train_test_split #切割数据集
from sklearn.linear_model import LogisticRegression #逻辑回归
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.svm import SVC, LinearSVC #支持向量机
from sklearn.neighbors import KNeighborsClassifier #k近邻
from sklearn.naive_bayes import GaussianNB #朴素贝叶斯
from sklearn.linear_model import Perceptron #感知机
from sklearn.linear_model import SGDClassifier #随机梯度下降
from sklearn.tree import DecisionTreeClassifier #决策树
from sklearn.preprocessing import LabelEncoder #分类变量编码
from sklearn.model_selection import cross_val_score #模型评估
# Load the training data and inspect missing-value counts per column.
df = pd.read_csv('data/train.csv')
# Count NaNs per column (Age, Cabin and Embarked have missing values).
df.isna().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
# Impute the continuous Age column with its mean.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Inspect the value distribution of Cabin (mostly unique, heavily missing).
df['Cabin'].value_counts()
B96 B98 4
G6 4
C23 C25 C27 4
C22 C26 3
F33 3
..
E34 1
C7 1
C54 1
E36 1
C148 1
Name: Cabin, Length: 147, dtype: int64
df['Embarked'].value_counts()
S 644 C 168 Q 77 Name: Embarked, dtype: int64
# Impute categorical variables: a placeholder string 'NA' for Cabin, and
# the most frequent port 'S' for the missing Embarked values.
df['Cabin'] = df['Cabin'].fillna('NA')
df['Embarked'] = df['Embarked'].fillna('S')
# Count fully duplicated rows.
df.duplicated().sum()
0
# Reload fresh copies of train/test; `combine` lets the same transforms be
# applied to both frames in a single loop.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
combine = [train_df, test_df]
View the overall distribution of the data
train_df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
train_df.describe()
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
train_df.describe(include=['O'])
| Name | Sex | Ticket | Cabin | Embarked | |
|---|---|---|---|---|---|
| count | 891 | 891 | 891 | 204 | 889 |
| unique | 891 | 2 | 681 | 147 | 3 |
| top | Braund, Mr. Owen Harris | male | 347082 | B96 B98 | S |
| freq | 1 | 577 | 7 | 4 | 644 |
# Compare missing-value counts between the train and test sets.
print(train_df.isna().sum())
print('_'*40)
print(test_df.isna().sum())
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64 ________________________________________ PassengerId 0 Pclass 0 Name 0 Sex 0 Age 86 SibSp 0 Parch 0 Ticket 0 Fare 1 Cabin 327 Embarked 0 dtype: int64
# Age histograms faceted by passenger class (rows) and sex (columns).
grid = sns.FacetGrid(train_df, row='Pclass', col='Sex', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x22eab933160>
# Encode Sex as an integer in both frames: female -> 1, male -> 0.
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
# Impute missing Age values with the median age of each (Sex, Pclass) group,
# rounded to the nearest 0.5 year, then cast Age to int in both frames.
guess_ages = np.zeros((2,3))
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            # Median age of passengers with this sex/class combination.
            guess_df = dataset[(dataset['Sex'] == i) & \
                (dataset['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_df.median()
            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                'Age'] = guess_ages[i,j]
    dataset['Age'] = dataset['Age'].astype(int)
# Bin Age into 5 equal-width bands and check mean survival per band.
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)
| AgeBand | Survived | |
|---|---|---|
| 0 | (-0.08, 16.0] | 0.550000 |
| 1 | (16.0, 32.0] | 0.337374 |
| 2 | (32.0, 48.0] | 0.412037 |
| 3 | (48.0, 64.0] | 0.434783 |
| 4 | (64.0, 80.0] | 0.090909 |
train_df.Fare
0 7.2500
1 71.2833
2 7.9250
3 53.1000
4 8.0500
...
886 13.0000
887 30.0000
888 23.4500
889 30.0000
890 7.7500
Name: Fare, Length: 891, dtype: float64
df.head(3)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NA | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NA | S |
df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
df.groupby(['Pclass','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
<Axes: xlabel='Pclass'>
df.groupby(['Sex','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
<Axes: xlabel='Sex'>
df.groupby(['SibSp','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
<Axes: xlabel='SibSp'>
df.groupby(['Parch','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
<Axes: xlabel='Parch'>
df.groupby(['Embarked','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
<Axes: xlabel='Embarked'>
# Age histograms split by survival outcome.
g = sns.FacetGrid(df, col='Survived')
g.map(plt.hist, 'Age', bins=20)
<seaborn.axisgrid.FacetGrid at 0x22ea580afb0>
# Age density by survival outcome. `fill=True` replaces the deprecated
# `shade=True` keyword (seaborn FutureWarning: removal in v0.14.0).
facet = sns.FacetGrid(df, hue="Survived", aspect=3)
facet.map(sns.kdeplot, 'Age', fill=True)
facet.set(xlim=(0, df['Age'].max()))
facet.add_legend()
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. func(*plot_args, **plot_kwargs) c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. func(*plot_args, **plot_kwargs)
<seaborn.axisgrid.FacetGrid at 0x22e841f6a10>
# Fare density by survival outcome; `fill=True` replaces deprecated `shade`.
facet = sns.FacetGrid(df, hue="Survived", aspect=3)
facet.map(sns.kdeplot, 'Fare', fill=True)
facet.set(xlim=(0, df['Fare'].max()))
facet.add_legend()
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. func(*plot_args, **plot_kwargs) c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. func(*plot_args, **plot_kwargs)
<seaborn.axisgrid.FacetGrid at 0x1f39775c4f0>
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
C:\Users\XLL\AppData\Local\Temp\ipykernel_11864\2675011510.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
<Axes: >
# Generate an interactive EDA report for the dataframe (ydata-profiling).
profile = ProfileReport(df)
profile
Summarize dataset: 100%|██████████| 47/47 [00:05<00:00, 7.95it/s, Completed] Generate report structure: 100%|██████████| 1/1 [00:05<00:00, 5.44s/it] Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.58s/it]
# Extract the honorific (Mr, Miss, ...) from Name for both frames.
# Raw string avoids the invalid-escape warning for '\.' in a normal literal.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Cross-tabulate titles against sex to sanity-check the extraction.
pd.crosstab(train_df['Title'], train_df['Sex'])
| Sex | female | male |
|---|---|---|
| Title | ||
| Capt | 0 | 1 |
| Col | 0 | 2 |
| Countess | 1 | 0 |
| Don | 0 | 1 |
| Dr | 1 | 6 |
| Jonkheer | 0 | 1 |
| Lady | 1 | 0 |
| Major | 0 | 2 |
| Master | 0 | 40 |
| Miss | 182 | 0 |
| Mlle | 2 | 0 |
| Mme | 1 | 0 |
| Mr | 0 | 517 |
| Mrs | 125 | 0 |
| Ms | 1 | 0 |
| Rev | 0 | 6 |
| Sir | 0 | 1 |
# Collapse rare titles into 'Rare' and normalize French/variant spellings,
# then inspect mean survival per title.
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
| Title | Survived | |
|---|---|---|
| 0 | Master | 0.575000 |
| 1 | Miss | 0.702703 |
| 2 | Mr | 0.156673 |
| 3 | Mrs | 0.793651 |
| 4 | Rare | 0.347826 |
# FamilySize = siblings/spouses + parents/children + the passenger themself.
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)
| FamilySize | Survived | |
|---|---|---|
| 3 | 4 | 0.724138 |
| 2 | 3 | 0.578431 |
| 1 | 2 | 0.552795 |
| 6 | 7 | 0.333333 |
| 0 | 1 | 0.303538 |
| 4 | 5 | 0.200000 |
| 5 | 6 | 0.136364 |
| 7 | 8 | 0.000000 |
| 8 | 11 | 0.000000 |
# IsAlone flags passengers travelling without family (FamilySize == 1).
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
| IsAlone | Survived | |
|---|---|---|
| 0 | 0 | 0.505650 |
| 1 | 1 | 0.303538 |
def model_score(X, y):
    """Split (X, y) into stratified train/test sets, fit a logistic
    regression, and print train/test accuracy.

    Parameters: X - feature frame/array; y - binary target.
    Returns None (prints scores only).
    """
    # Stratified split preserves the class balance in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    # max_iter raised from the default 100: the default triggered an lbfgs
    # ConvergenceWarning on this data (as the warning itself advises).
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
    print("Testing set score: {:.2f}".format(lr.score(X_test, y_test)))
# Select the input features for the model.
data = df[['Pclass','Sex','Age','SibSp','Parch','Fare', 'Embarked']]
# One-hot encode the categorical columns (Sex, Embarked) as dummy variables.
data = pd.get_dummies(data)
data.head(1)
| Pclass | Age | SibSp | Parch | Fare | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 22.0 | 1 | 0 | 7.25 | 0 | 1 | 0 | 0 | 1 |
model_score(data,df['Survived'])
Training set score: 0.80 Testing set score: 0.78
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
# Reload the training data and label-encode the categorical columns.
df = pd.read_csv('data/train_1.csv')
for feat in ['Sex', 'Embarked']:
    # The original hand-built appearance-order mapping (`label_dict` + map)
    # was dead code: its result was immediately overwritten by the
    # LabelEncoder output below, which encodes the *original* column.
    df[feat + "_labelEncode"] = LabelEncoder().fit_transform(df[feat].astype(str))
# Select the label-encoded input features.
data = df[['Pclass','Sex_labelEncode','Age','SibSp','Parch','Fare', 'Embarked_labelEncode']]
data.head(1)
| Pclass | Sex_labelEncode | Age | SibSp | Parch | Fare | Embarked_labelEncode | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 1 | 22.0 | 1 | 0 | 7.25 | 2 |
model_score(data,df['Survived'])
Training set score: 0.81 Testing set score: 0.79
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
df = pd.read_csv('data/train_2.csv')
# Bin Age into 5 equal-width bands labelled 1-5 and plot survival per band.
df['AgeBand'] = pd.cut(df['Age'], 5, labels=[1,2,3,4,5])
# `stacked` expects a boolean, not the string 'True'.
df.groupby(['AgeBand','Survived'])['Survived'].count().unstack().plot(kind='bar', stacked=True)
<Axes: xlabel='AgeBand'>
df['Fare'].describe()
count 891.000000 mean 32.204208 std 49.693429 min 0.000000 25% 7.910400 50% 14.454200 75% 31.000000 max 512.329200 Name: Fare, dtype: float64
# Bin Fare into hand-chosen intervals labelled 1-5 and plot survival per band.
df['FareBand'] = pd.cut(df['Fare'], [0, 8, 16, 32, 100, 513], labels=[1, 2, 3, 4, 5])
# `stacked` expects a boolean, not the string 'True'.
df.groupby(['FareBand','Survived'])['Survived'].count().unstack().plot(kind='bar', stacked=True)
<Axes: xlabel='FareBand'>
# Select features with the banded Age/Fare replacing the raw values.
data = df[['Pclass','Sex_labelEncode','AgeBand','SibSp','Parch','FareBand', 'Embarked_labelEncode']]
data.head(1)
| Pclass | Sex_labelEncode | AgeBand | SibSp | Parch | FareBand | Embarked_labelEncode | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 1 | 2 | 1 | 0 | 1 | 2 |
model_score(data,df['Survived'])
Training set score: 0.62 Testing set score: 0.61
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arrays\base.py:513: RuntimeWarning: invalid value encountered in cast
result = np.asarray(self, dtype=dtype)
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=2):
ABNORMAL_TERMINATION_IN_LNSRCH.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arrays\base.py:513: RuntimeWarning: invalid value encountered in cast
result = np.asarray(self, dtype=dtype)
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arrays\base.py:513: RuntimeWarning: invalid value encountered in cast
result = np.asarray(self, dtype=dtype)
def preprocessing(df):
    """Baseline preprocessing: impute missing values and label-encode the
    categorical columns; return the model-input feature frame.

    Note: mutates ``df`` in place (adds *_labelEncode columns).
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')
    for feat in ['Sex', 'Embarked']:
        # The original appearance-order mapping was dead code: it was
        # immediately overwritten by this LabelEncoder result, which
        # encodes the original string column.
        df[feat + "_labelEncode"] = LabelEncoder().fit_transform(df[feat].astype(str))
    data = df[['Pclass','Sex_labelEncode','Age','SibSp','Parch','Fare', 'Embarked_labelEncode']]
    return data
def training():
    """Train a logistic-regression survival model on data/train.csv,
    print stratified train/test accuracy, and return the fitted model."""
    df = pd.read_csv('data/train.csv')
    data = preprocessing(df)
    X = data
    y = df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    # max_iter raised from the default 100, which triggered an lbfgs
    # ConvergenceWarning on this data.
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
    print("Testing set score: {:.2f}".format(lr.score(X_test, y_test)))
    return lr
lr = training()
Training set score: 0.81 Testing set score: 0.79
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
# Apply the same preprocessing to the test set, predict survival,
# and write the Kaggle-style submission file.
test = pd.read_csv('data/test.csv')
test_data = preprocessing(test)
pred = lr.predict(test_data)
result = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
result.to_csv('data/result.csv',index=False)
def preprocessing(df):
    """Preprocessing v2: impute missing values, label-encode categoricals,
    and use a 5-level AgeBand instead of raw Age; returns the feature frame.

    Note: mutates ``df`` in place.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')
    for feat in ['Sex', 'Embarked']:
        # Dead appearance-order mapping removed; the LabelEncoder output of
        # the original column was what the original code ultimately kept.
        df[feat + "_labelEncode"] = LabelEncoder().fit_transform(df[feat].astype(str))
    # Equal-width age bands labelled 1-5.
    df['AgeBand'] = pd.cut(df['Age'], 5, labels=[1, 2, 3, 4, 5])
    data = df[['Pclass','Sex_labelEncode','AgeBand','SibSp','Parch','Fare', 'Embarked_labelEncode']]
    return data
def training():
    """Train logistic regression on data/train.csv using preprocessing v2,
    print stratified train/test accuracy, and return the fitted model."""
    df = pd.read_csv('data/train.csv')
    data = preprocessing(df)
    X = data
    y = df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    # max_iter raised from the default 100 (lbfgs ConvergenceWarning).
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
    print("Testing set score: {:.2f}".format(lr.score(X_test, y_test)))
    return lr
lr = training()
Training set score: 0.80 Testing set score: 0.78
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
# Preprocess the test set with preprocessing v2, predict, and write
# the submission file.
test = pd.read_csv('data/test.csv')
test_data = preprocessing(test)
pred = lr.predict(test_data)
result = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
result.to_csv('data/result.csv',index=False)
def preprocessing(df):
    """Preprocessing v3: impute, derive Title/FamilySize/IsAlone, encode
    categoricals, impute Age per (Sex, Pclass), band Age; returns the
    model-input feature frame.

    Note: mutates ``df`` in place.
    """
    # Median fare is more robust to the extreme high fares than the mean.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')
    # Raw string avoids the invalid-escape warning for '\.'.
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1
    for feat in ['Sex', 'Embarked', 'Title']:
        lbl = LabelEncoder()
        # NOTE(review): categories are first mapped to appearance-order
        # codes, then the stringified codes are label-encoded again. For
        # fewer than 10 categories the net result is the appearance-order
        # code; both steps are kept to preserve the original behavior.
        label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
        df[feat] = df[feat].map(label_dict)
        df[feat] = lbl.fit_transform(df[feat].astype(str))
    # Impute Age with the per-(Sex, Pclass) median, rounded to nearest 0.5.
    guess_ages = np.zeros((2, 3))
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = df[(df['Sex'] == i) &
                (df['Pclass'] == j + 1)]['Age'].dropna()
            age_guess = guess_df.median()
            guess_ages[i, j] = int(age_guess / 0.5 + 0.5) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            df.loc[(df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j + 1),
                'Age'] = guess_ages[i, j]
    df['Age'] = df['Age'].astype(int)
    # Equal-width age bands labelled 1-5.
    df['AgeBand'] = pd.cut(df['Age'], 5, labels=[1, 2, 3, 4, 5])
    data = df[['Pclass','Sex','AgeBand','SibSp','Parch','Fare','Embarked','Title','IsAlone']]
    return data
def training():
    """Fit logistic regression on the full training set (no hold-out split
    in this version), print training accuracy, and return the model."""
    df = pd.read_csv('data/train.csv')
    data = preprocessing(df)
    X = data
    y = df['Survived']
    # max_iter raised from the default 100 (lbfgs ConvergenceWarning).
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X, y)
    print("Training set score: {:.2f}".format(lr.score(X, y)))
    return lr
lr = training()
Training set score: 0.81
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
# Preprocess the test set with preprocessing v3, predict, and write
# the submission file.
test = pd.read_csv('data/test.csv')
test_data = preprocessing(test)
pred = lr.predict(test_data)
result = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
result.to_csv('data/result.csv',index=False)
def preprocessing(df):
    """Preprocessing v4: same as v3 plus an Age*Class interaction feature;
    returns the model-input feature frame.

    Note: mutates ``df`` in place.
    """
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')
    # Raw string avoids the invalid-escape warning for '\.'.
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1
    for feat in ['Sex', 'Embarked', 'Title']:
        lbl = LabelEncoder()
        # NOTE(review): double encoding (appearance-order map, then
        # LabelEncoder over the stringified codes) kept as-is to preserve
        # the original behavior; net effect is appearance-order codes.
        label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
        df[feat] = df[feat].map(label_dict)
        df[feat] = lbl.fit_transform(df[feat].astype(str))
    # Impute Age with the per-(Sex, Pclass) median, rounded to nearest 0.5.
    guess_ages = np.zeros((2, 3))
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = df[(df['Sex'] == i) &
                (df['Pclass'] == j + 1)]['Age'].dropna()
            age_guess = guess_df.median()
            guess_ages[i, j] = int(age_guess / 0.5 + 0.5) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            df.loc[(df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j + 1),
                'Age'] = guess_ages[i, j]
    df['Age'] = df['Age'].astype(int)
    df['AgeBand'] = pd.cut(df['Age'], 5, labels=[1, 2, 3, 4, 5])
    # Interaction feature: age scaled by class.
    df['Age*Class'] = df.Age * df.Pclass
    data = df[['Pclass','Sex','AgeBand','SibSp','Parch','Fare','Embarked','Title','IsAlone','Age*Class']]
    return data
def training():
    """Fit a random forest on the full training set, print its mean
    10-fold cross-validated accuracy, and return the fitted model."""
    df = pd.read_csv('data/train.csv')
    data = preprocessing(df)
    X = data
    y = df['Survived']
    # Renamed from the misleading `lr`: this is a random forest, not a
    # logistic regression.
    model = RandomForestClassifier()
    # Fit on all data so the returned model can be used for prediction;
    # cross_val_score clones the estimator, so the fit order is irrelevant.
    model.fit(X, y)
    print(cross_val_score(model, X, y, cv=10).mean())
    return model
lr = training()
0.8159925093632958
# Preprocess the test set with preprocessing v4, predict with the random
# forest, and write the submission file.
test = pd.read_csv('data/test.csv')
test_data = preprocessing(test)
pred = lr.predict(test_data)
result = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
result.to_csv('data/result.csv',index=False)
def preprocessing(df):
    """Final preprocessing (identical to v4): impute, derive Title /
    FamilySize / IsAlone / Age*Class, encode categoricals, band Age;
    returns the model-input feature frame.

    Note: mutates ``df`` in place.
    """
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')
    # Raw string avoids the invalid-escape warning for '\.'.
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1
    for feat in ['Sex', 'Embarked', 'Title']:
        lbl = LabelEncoder()
        # NOTE(review): double encoding kept as-is to preserve behavior;
        # net effect is appearance-order integer codes.
        label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
        df[feat] = df[feat].map(label_dict)
        df[feat] = lbl.fit_transform(df[feat].astype(str))
    # Impute Age with the per-(Sex, Pclass) median, rounded to nearest 0.5.
    guess_ages = np.zeros((2, 3))
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = df[(df['Sex'] == i) &
                (df['Pclass'] == j + 1)]['Age'].dropna()
            age_guess = guess_df.median()
            guess_ages[i, j] = int(age_guess / 0.5 + 0.5) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            df.loc[(df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j + 1),
                'Age'] = guess_ages[i, j]
    df['Age'] = df['Age'].astype(int)
    df['AgeBand'] = pd.cut(df['Age'], 5, labels=[1, 2, 3, 4, 5])
    # Interaction feature: age scaled by class.
    df['Age*Class'] = df.Age * df.Pclass
    data = df[['Pclass','Sex','AgeBand','SibSp','Parch','Fare','Embarked','Title','IsAlone','Age*Class']]
    return data
scores = pd.Series()
C:\Users\XLL\AppData\Local\Temp\ipykernel_772\3866007741.py:1: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. scores = pd.Series()
# Benchmark candidate classifiers by mean 10-fold cross-validated accuracy.
df = pd.read_csv('data/train.csv')
data = preprocessing(df)
X = data
y = df['Survived']
# max_iter raised from the default 100, which triggered an lbfgs
# ConvergenceWarning on this data.
lr = LogisticRegression(max_iter=1000)
lr.fit(X, y)
scores['LogisticRegression'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
lr = SVC()
lr.fit(X, y)
scores['Support Vector Machines'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
lr = KNeighborsClassifier()
lr.fit(X, y)
scores['KNN'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
lr = GaussianNB()
lr.fit(X, y)
scores['Naive Bayes'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
lr = Perceptron()
lr.fit(X, y)
scores['Perceptron'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
lr = LinearSVC()
lr.fit(X, y)
scores['Linear Support Vector Machines'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
lr = SGDClassifier()
lr.fit(X, y)
scores['Stochastic Gradient Descent'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
lr = DecisionTreeClassifier()
lr.fit(X, y)
scores['Decision Tree'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
lr = RandomForestClassifier()
lr.fit(X, y)
scores['Random Forest'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
# Present the cross-validation results as a leaderboard, best model first.
models = (
    scores.reset_index()
    .set_axis(['model', 'score'], axis=1)
    .sort_values('score', ascending=False)
)
models
| model | score | |
|---|---|---|
| 8 | Random Forest | 81.82 |
| 0 | LogisticRegression | 80.59 |
| 3 | Naive Bayes | 80.59 |
| 7 | Decision Tree | 79.13 |
| 6 | Stochastic Gradient Descent | 77.1 |
| 4 | Perceptron | 72.72 |
| 2 | KNN | 70.94 |
| 1 | Support Vector Machines | 70.5 |
| 5 | Linear Support Vector Machines | 66.77 |